/*** TEST SCORE RANK-RANK DISTRIBUTIONS ***/

* Session setup: pager setting, empty memory, project globals, fresh log file.
set more 1
clear all

* paths.do is not visible here; it presumably defines the WORKING and LOGFILES
* globals used throughout this file -- confirm.
do "paths.do"

* close any log left open by an earlier (possibly aborted) run
capture log close
* forward-slash separator: accepted by Stata on every platform, whereas a
* backslash only resolves on Windows
log using "$LOGFILES/track_mob_1.log", replace

*************************************************
*** section 1: tracking measures ***
*************************************************

** tracking based on actual tracking exposure
** input : tracking_7 (math rows only)
** output: temp file track_mob_1_0 with campus grade year trk_unadj trk_rel spc,
**         merged m:1 on campus grade year in section 4
use "$WORKING/tracking_7", clear
keep if subj == "math"
gen spc = schsz / numcls
la var spc "students per class"
keep campus grade year ztrack* spc
* unadjusted measure is ztrackR as-is
gen trk_unadj = ztrackR
* relative measure: gap between ztrackR and ztrackRavg, scaled by the gap
* between ztrackRavg_max and ztrackRavg (missing when that denominator is zero)
gen trk_rel = (ztrackR - ztrackRavg) / (ztrackRavg_max - ztrackRavg)
* merge keys must be numeric to match the student-level file
destring campus grade year, replace
keep campus grade year trk_* spc
* forward slashes throughout so the path also resolves outside Windows
save "$WORKING/temp/track_mob_1_0.dta", replace


*************************************************
*** section 2: initial math test scores ***
*************************************************

** Get first available math score across grades 3-7 for every student
** output: temp file track_mob_1_1 with one row per id2
**         (zscore_init grade_init year_init)
use "$WORKING/tracking_1.dta", clear
keep id2 year Mgrade Mscore
keep if (Mscore != .) & (Mgrade != .) & (Mgrade < 8) & (id2 != "")
sort id2 year Mgrade
* a student-year-grade combination must be unique, otherwise the first score
* picked below would be ambiguous
by id2 year Mgrade: gen tmp1 = _N
assert (tmp1 == 1)
* rows are ordered by year then grade within student, so the first row is the
* earliest available score
by id2: gen tmp2 = _n
keep if (tmp2 == 1)
drop tmp*

** merge in dataset with mean/sd by grade-year and calculate z-scores
gen test_grade = Mgrade
gen test_year = year
* forward-slash path so the merge also resolves outside Windows
merge m:1 test_grade test_year using "$WORKING/tracking_1-meansd.dta"
drop if _merge == 2
* every retained score must have a grade-year mean and sd
assert (_merge == 3)
gen zscore_init = (Mscore - score_mean) / score_sd
rename Mgrade grade_init
rename year year_init
tab grade_init year_init, m
keep id2 zscore_init grade_init year_init
save "$WORKING/temp/track_mob_1_1.dta", replace
tab grade_init year_init

*************************************************
*** section 3: creating analysis sample ***
*************************************************

** base sample includes students without focal math classes (tracking_6_NOMATH)
** as well as students with focal math classes (tracking_6-math)

** for those without focal math classes, need to add some info that is added
** later in tracking_6 than when tracking_6_NOMATH is created
use "$WORKING/tracking_6_NOMATH.dta", clear
keep if Snomath==1

** physical districts for charters
destring campus, replace
merge m:1 campus year using "$WORKING/tracking_6_geo_distnum.dta"
keep if (_merge != 2)
drop _merge
* district number = campus number with its last three digits removed
gen distnum = floor(campus / 1000)
gen distnum0 = distnum
* campuses flagged by any of the three Ct_chrt* indicators use geo_distnum instead
replace distnum0 = geo_distnum if (Ct_chrtd == 1) | (Ct_chrtc == 1) | (Ct_chrtu == 1)

** lagged z-score
gen test_grade = grade - 1
gen test_year = year - 1
* forward-slash path so the merge also resolves outside Windows
merge m:1 test_grade test_year using "$WORKING/tracking_1-meansd.dta"
drop if _merge == 2
* grades 3 and 9 are allowed to have no prev-grade prev-year mean and sd
assert (_merge == 3) | (grade == 9) | (grade == 3)
drop _merge test_grade test_year
la var score_mean "mean score on prev-grade prev-year test"
la var score_sd "sd of scores on prev-grade prev-year test"
gen zLMscore = (LMscore - score_mean) / score_sd
* the label used to reference a local macro that is never defined in this file,
* which expanded to nothing; this script only handles math, so spell it out
la var zLMscore "normalized lagged math score"

keep id2 distnum0 campus grade year Snomath Sclass_id Mscore LMscore LMgrade zLMscore D* Mcd* Pdisadv
order id2 distnum0 campus grade year Snomath Sclass_id Mscore LMscore LMgrade zLMscore D* Mcd* Pdisadv
* placeholders so the variable list matches the focal-math file this is appended to
gen over8 = .
gen under8 = .
save "$WORKING/temp/track_mob_1_2a.dta", replace


** stack the focal-math students with the no-focal-math file saved just above
local sampvars id2 distnum0 campus grade year Snomath Sclass_id Mscore LMscore LMgrade zLMscore D* Mcd* Pdisadv over8 under8
use "$WORKING/tracking_6-math.dta", clear
keep `sampvars'
order `sampvars'
append using "$WORKING/temp/track_mob_1_2a.dta"

** reduce to a single observation per student-year. When a student-year has
** several rows, the sort below ranks them so that the first row is, in order:
****   1. a base-sample row (samp0 == 1)
****   2. a row with a math class (Snomath == 0)
****   3. the smallest district, then campus, then grade (arbitrary tie-breakers)
gen samp0 = (grade == 4) & (LMgrade == 3) & (zLMscore != .) & (year <= 2015)
la var samp0 "4th grader w/ non-missing prior year grade 3 scores, up to 2015"
gsort +id2 +year -samp0 +Snomath +distnum0 +campus +grade
* first-ranked row of every student-year survives
by id2 year: keep if _n == 1
destring campus, force replace
save "$WORKING/temp/track_mob_1_2.dta", replace

** analysis cohorts: base-sample rows that also have a focal math class
keep if samp0 & !Snomath
* a student qualifying in more than one year is assigned to the earliest year
bysort id2 (year): keep if _n == 1
save "$WORKING/temp/track_mob_1_3.dta", replace

** rename and create base-period variables
rename year base_year
la var base_year "initial (grade 4) year"
rename distnum0 base_district
* label previously said campus, duplicating the base_campus label
la var base_district "initial (grade 4) district"
rename campus base_campus
la var base_campus "initial (grade 4) campus"
rename Sclass_id base_class
la var base_class "initial (grade 4) math class"
rename zLMscore base_zscore
la var base_zscore "initial (grade 4) math z-score"

** create within-sample score percentiles
* id2 is added as the last sort key: Stata orders tied keys randomly, so without
* it the tie-ignoring rank (base_pctraw) would change from run to run. One row
* per id2 exists at this point, so the ordering is now fully determined.
sort base_year base_zscore id2
* tmp1 = zero-based rank within cohort, tmp2 = cohort size
by base_year: gen tmp1 = _n - 1
by base_year: gen tmp2 = _N
gen base_pctraw = tmp1 / tmp2
la var base_pctraw "initial prior (grade 3) percentile (ignoring ties)"
* tie-corrected version: every student sharing a score gets the midpoint of the
* lowest and highest rank held by that score
egen tmp3 = max(tmp1), by(base_year base_zscore)
egen tmp4 = min(tmp1), by(base_year base_zscore)
gen tmp5 = tmp3 / tmp2
gen tmp6 = tmp4 / tmp2
gen base_pctfix = (tmp5 + tmp6) / 2
la var base_pctfix "initial prior (grade 3) percentile (fixed for ties)"
keep id2 base_year base_district base_campus base_class base_zscore base_pctraw base_pctfix
save "$WORKING/temp/track_mob_1_4.dta", replace

** build a balanced 5-year panel (g4-g8 for a typical student):
** five copies of every student, numbered 0-4 relative to the base year
expand 5
bysort id2: gen rel_year = _n - 1
gen year = base_year + rel_year
save "$WORKING/temp/track_mob_1_5.dta", replace

** get data for each year in panel
merge m:1 id2 year using "$WORKING/temp/track_mob_1_2.dta"
keep if (_merge != 2)
* has_year == 0 marks panel years in which the student has no record
gen has_year = (_merge == 3)
drop _merge
tab rel_year Snomath


** create current math z-scores
gen test_grade = grade
gen test_year = year
* forward-slash path so the merge also resolves outside Windows
merge m:1 test_grade test_year using "$WORKING/tracking_1-meansd.dta"
drop if _merge == 2
* unmatched rows are tolerated only for years without a record and for grade 9
assert (_merge == 3) | (!has_year) | (grade == 9)
drop _merge test_grade test_year
gen zMscore = (Mscore - score_mean) / score_sd
la var zMscore "normalized math score"

** condense score codes to a single variable
** when more than one flag is set, the last matching replace below wins
** NOTE(review): a missing flag evaluates as true in an if-qualifier. Rows with
** has_year == 0 are blanked at the end, but a matched row with missing flags
** would end up with the last label -- confirm flags are never missing there.
descr Mcd*
gen mcd_str = "none"
replace mcd_str = "no information - D" if Mcdninfd
replace mcd_str = "no information - *" if Mcdninfs
replace mcd_str = "alternative standard" if Mcdastan
replace mcd_str = "absent for test" if Mcdabs
replace mcd_str = "other" if Mcdother
replace mcd_str = "LEP exempt for test" if Mcdlep
replace mcd_str = "spec ed exempt for test" if Mcdsped
replace mcd_str = "prior pass exemption" if Mcdpass
replace mcd_str = "" if !has_year
drop Mcd*
save "$WORKING/temp/track_mob_1_6.dta", replace

** class-level size and peer quality
** 2023-07-19 AYSM note: these only exist for students with focal math classes.
** output: temp file track_mob_1_7, one row per district-campus-grade-year-class
use id2 distnum0 campus grade year Sclass_id zLMscore using "$WORKING/tracking_6-math.dta", clear
* attach every student's initial score
merge m:1 id2 using "$WORKING/temp/track_mob_1_1.dta"
drop if _merge == 2
drop _merge
* constant that is summed to count the rows in every class
gen one0 = 1
collapse (sum) obs_cls=one0 ///
		(mean) peer_prev=zLMscore peer_init=zscore_init ///
		(sd) sdpeer_prev=zLMscore sdpeer_init=zscore_init, ///
		by(distnum0 campus grade year Sclass_id)
destring campus, force replace

save "$WORKING/temp/track_mob_1_7.dta", replace
* back to the student-year panel; attach the class-level statistics
use "$WORKING/temp/track_mob_1_6.dta", clear
merge m:1 distnum0 campus grade year Sclass_id using "$WORKING/temp/track_mob_1_7.dta"
drop if (_merge == 2)
drop _merge

// gen district = floor(campus / 1000)
// gen base_district = floor(base_campus / 1000)
* distnum0 becomes the current-year district; all names are lower-cased from here on
rename distnum0 district
rename *, lower
sort id2 year
save "$WORKING/temp/track_mob_1_8.dta", replace


*************************************************
*** section 4: merge and clean ***
*************************************************

** bring in tracking measures
use "$WORKING/temp/track_mob_1_8.dta", clear
* forward slashes throughout so the path also resolves outside Windows
merge m:1 campus grade year using "$WORKING/temp/track_mob_1_0.dta"
keep if _merge != 2
* has_trk flags student-years whose campus-grade-year has a tracking record
gen has_trk = (_merge == 3)
drop _merge
tab year has_trk if (campus != .) & (grade != .) & (grade >= 4), missing

* renamed so the d* wildcard in the keep below picks it up
rename pdisadv ddisadv

** keep only needed variables
** (d* matches every variable starting with d, including district)
keep id2 base_year rel_year has_year base_district base_campus base_class grade campus ///
		d* base_zscore base_pctraw base_pctfix mcd_str mscore ///
		obs_cls peer_prev peer_init sdpeer_prev sdpeer_init ///
		snomath over8 under8 trk_unadj trk_rel
* long file: one row per student and relative year
save "$WORKING/track_mob_A.dta", replace

* diagnostic only; the in-memory sort is not saved
sort id2 rel_year
tab rel_year grade, missing

** go from long to wide: one row per student, time-varying variables get the
** relative year (0-4) appended to their name
use "$WORKING/track_mob_A.dta", clear
drop base_* d*
local widevars has_year grade campus mcd_str mscore ///
		obs_cls peer_prev peer_init sdpeer_prev sdpeer_init ///
		trk_unadj trk_rel over8 under8 snomath
reshape wide `widevars', i(id2) j(rel_year)

** the over-level and under-level indicators for 8th grade are only wanted
** 4 years after grade 4, so the copies for relative years 0-3 are discarded
forvalues k = 0/3 {
	drop over8`k' under8`k'
}
rename (over84 under84) (over8 under8)
save "$WORKING/temp/track_mob_1_9.dta", replace

* base-year row of every student: base_* variables plus the d* variables
use if (rel_year == 0) using "$WORKING/track_mob_A.dta", clear
keep id2 base_* d*
* every student must be present on both sides
merge 1:1 id2 using "$WORKING/temp/track_mob_1_9.dta", assert(match) nogenerate

save "$WORKING/track_mob_B.dta", replace

** generate enrollment and score statuses, and within-cohort percentile scores
** one pass per relative year (0 = base year, 4 = four years later); every
** variable created for a relative year carries that number as a suffix
foreach ry in 0 1 2 3 4 {
	*** enrollment statuses (mutually exclusive):
	***   left     - no grade recorded in this relative year
	***   retained - grade below the on-time grade (relative year + 4)
	***   ontrack  - everything else (on-time grade or above)
	gen status_left = (grade`ry' == .)
	gen status_retained = (grade`ry' < (`ry' + 4))
	gen status_ontrack = !(status_left | status_retained)
	gen tmp1 = status_left + status_retained + status_ontrack
	assert tmp1 == 1
	drop tmp1

	** fill scores, THEN percentile scores
	*** for a student without a score in this relative year, use the mean
	*** current-year score of same-cohort students who had the same score in an
	*** earlier relative year, trying 1 year back first, then 2, 3 and 4.
	*** rel_fill records where the score came from: 0 = own score, k = filled
	*** from k years back, missing = no score available.
	gen rel_fill`ry' = .
	gen mscore_fill`ry' = mscore`ry'
	replace rel_fill`ry' = 0 if (mscore`ry' != .)
	foreach years_back in 1 2 3 4 {
		local fill_from = `ry' - `years_back'
		if (`fill_from' < 0) {
			continue
		}
		egen tmp1 = mean(mscore`ry'), by(base_year mscore`fill_from')
		*** the cell mean is missing when nobody in the cell has a current-year
		*** score; in that case nothing is filled, so rel_fill must stay unset
		*** (otherwise the student is flagged as filled while has_score is 0)
		replace rel_fill`ry' = `years_back' if (mscore_fill`ry' == .) & (mscore`fill_from' != .) & (!status_left) & !missing(tmp1)
		replace mscore_fill`ry' = tmp1 if (mscore_fill`ry' == .) & (mscore`fill_from' != .) & (!status_left) & !missing(tmp1)
		drop tmp*
	}


	** generate percentile scores
	gen has_score`ry' = (mscore_fill`ry' != .)
	sort base_year has_score`ry' mscore_fill`ry'
	*** nominal scores - same score spans multiple quantile values
	by base_year has_score`ry': gen tmp1 = _n - 1
	by base_year has_score`ry': gen tmp2 = _N
	*** fixed scores - all of same score gets same/average quantile value
	egen tmp3 = max(tmp1), by(base_year has_score`ry' mscore_fill`ry')
	egen tmp4 = min(tmp1), by(base_year has_score`ry' mscore_fill`ry')
	gen tmp5 = tmp3 / tmp2
	gen tmp6 = tmp4 / tmp2
	gen mscore_fill_pct`ry' = (tmp5 + tmp6) / 2
	replace mscore_fill_pct`ry' = . if !has_score`ry'
	drop tmp*

	*** score statuses (mutually exclusive together with left)
	gen status0_curr = (rel_fill`ry' == 0)
	gen status0_fill1 = (rel_fill`ry' == 1)
	gen status0_fill2p = (rel_fill`ry' >= 2) & (rel_fill`ry' != .)
	gen status0_fillN = (rel_fill`ry' == .)
	gen status_curr = (status_retained | status_ontrack) & status0_curr
	gen status_fill1 = (status_retained | status_ontrack) & status0_fill1
	gen status_fill2p = (status_retained | status_ontrack) & status0_fill2p
	gen status_fillN = (status_retained | status_ontrack) & status0_fillN
	gen tmp1 = status_left + status_curr + status_fill1 + status_fill2p + status_fillN
	assert tmp1 == 1
	drop tmp1

	** collapse the status dummies into one string variable each
	gen stat_enrl`ry' = "none"
	foreach vv in left retained ontrack {
		replace stat_enrl`ry' = "`vv'" if (status_`vv')
	}
	gen stat_score`ry' = "none"
	foreach vv in left curr fill1 fill2p fillN {
		replace stat_score`ry' = "`vv'" if (status_`vv')
	}

	** demean tracking variables within this relative year
	** (mean taken over all students with a non-missing value)
	foreach vv in unadj rel {
		egen tmp1 = mean(trk_`vv'`ry')
		gen trk_`vv'_dm`ry' = trk_`vv'`ry' - tmp1
		drop tmp*
	}
	** demean class size and peer quality within this relative year
	foreach vv in obs_cls peer_prev peer_init sdpeer_prev sdpeer_init {
		egen tmp1 = mean(`vv'`ry')
		gen `vv'_dm`ry' = `vv'`ry' - tmp1
		drop tmp*
	}
	*** the status dummies are per-iteration scratch; only stat_enrl and
	*** stat_score are kept
	drop status*
}

save "$WORKING/track_mob_C.dta", replace

log close
